# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import redis
import re

class MasterPipeline(object):
    """Item pipeline that routes scraped URLs into Redis.

    URLs that contain a Douban book subject id are pushed onto the
    ``book:start_urls`` list the first time that id is seen (ids are
    tracked in the ``book:id`` set). URLs without a book id are pushed
    onto ``book:no_urls``.
    """

    # Captures the numeric subject id of a Douban book URL. The dots are
    # escaped so that only the literal host "book.douban.com" matches.
    _BOOK_ID_RE = re.compile(r"book\.douban\.com/subject/([0-9]+)/")

    def __init__(self, host, port):
        """Open the Redis connection used by this pipeline.

        Args:
            host: Redis server host.
            port: Redis server port.
        """
        self.r = redis.Redis(host=host, port=port, decode_responses=True)

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the crawler's settings.

        Reads the ``REDIS_HOST`` and ``REDIS_PORT`` settings.
        """
        return cls(
            host=crawler.settings.get("REDIS_HOST"),
            port=crawler.settings.get("REDIS_PORT"),
        )

    def process_item(self, item, spider):
        """Store the item's URL in Redis and pass the item on.

        Args:
            item: Scraped item; must provide a ``"url"`` key.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, so that later pipelines keep receiving it.

        Raises:
            KeyError: If ``item`` has no ``"url"`` key.
        """
        url = item["url"]
        # Check whether the URL is a valid book link; only valid ones are
        # queued for crawling.
        match = self._BOOK_ID_RE.search(url)
        if match is None:
            self.r.lpush('book:no_urls', url)
        elif self.r.sadd('book:id', match.group(1)):
            # A truthy sadd result means the id was not in the set yet,
            # which filters out duplicate book links.
            self.r.lpush('book:start_urls', url)
        return item
